#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from __future__ import annotations

import argparse
from pathlib import Path

import numpy as np
import pandas as pd


_KEY_COLUMNS = ("city_code6", "year")


def _normalize_city_code(codes: pd.Series) -> pd.Series:
    """Return city codes as stripped, zero-padded six-character strings."""
    text = codes.astype(str).str.strip()
    # When the column contains blank cells pandas parses it as float, so the
    # code 110000 is stringified as "110000.0"; drop that suffix so both files
    # produce identical join keys.
    text = text.str.replace(r"\.0+$", "", regex=True)
    return text.str.zfill(6)


def _require_columns(frame: pd.DataFrame, required, label: str) -> None:
    """Raise RuntimeError naming every column of *required* absent from *frame*."""
    missing = [c for c in required if c not in frame.columns]
    if missing:
        raise RuntimeError(f"{label} missing columns: {missing}. Available: {list(frame.columns)}")


def main() -> None:
    """Left-join the WeChat city-year index onto the env panel and write a CSV.

    Command-line options:
        --env:    path of the env panel CSV (left side of the join).
        --wechat: path of the WeChat city-year CSV (right side of the join).
        --out:    path of the merged CSV; parent directories are created.

    Three columns are derived after the join: ``wechat_rate_10k``
    (``topic_intensity_year`` times 10,000), ``log_wechat_hits`` and
    ``log_wechat_total`` (``log1p`` of ``docs_hit`` / ``docs_total``).

    Raises:
        SystemExit: if either input file does not exist.
        RuntimeError: if an input file lacks a required column.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--env", default="media_project/out/env_policy_city_year.csv")
    ap.add_argument("--wechat", default="media_project/out/gov_wechat_lowcarbon_city_year.csv")
    ap.add_argument("--out", default="media_project/out/env_policy_city_year_with_wechat.csv")
    args = ap.parse_args()

    env_path = Path(args.env)
    wechat_path = Path(args.wechat)
    if not env_path.exists():
        raise SystemExit(f"Missing env panel: {env_path}. Run build_env_policy_panel.py first.")
    if not wechat_path.exists():
        raise SystemExit(f"Missing wechat index: {wechat_path}. Run build_gov_wechat_topic_index.py + aggregate first.")

    env = pd.read_csv(env_path, encoding="utf-8-sig")
    wechat = pd.read_csv(wechat_path, encoding="utf-8-sig")

    # Validate before touching any column so a malformed input produces the
    # descriptive error instead of a bare KeyError.
    keep = ["city_code6", "year", "docs_total", "docs_hit", "topic_intensity_year"]
    _require_columns(env, _KEY_COLUMNS, "Env panel")
    _require_columns(wechat, keep, "WeChat year file")

    # Bring the join keys of both frames to the same representation.
    for frame in (env, wechat):
        frame["city_code6"] = _normalize_city_code(frame["city_code6"])
        frame["year"] = pd.to_numeric(frame["year"], errors="coerce").astype("Int64")

    # NOTE(review): duplicate (city_code6, year) rows in the WeChat file would
    # duplicate env rows here; consider validate="m:1" if that must not happen.
    merged = env.merge(wechat[keep], how="left", on=list(_KEY_COLUMNS))

    # Coerce all three source columns the same way so a stray non-numeric cell
    # becomes NaN rather than aborting the run.
    intensity = pd.to_numeric(merged["topic_intensity_year"], errors="coerce")
    merged["wechat_rate_10k"] = 10_000.0 * intensity
    merged["log_wechat_hits"] = np.log1p(pd.to_numeric(merged["docs_hit"], errors="coerce"))
    merged["log_wechat_total"] = np.log1p(pd.to_numeric(merged["docs_total"], errors="coerce"))

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    merged.to_csv(out_path, index=False, encoding="utf-8-sig")
    print(f"Wrote {out_path} rows={len(merged)} wechat_coverage={merged['topic_intensity_year'].notna().mean():.3f}")


# Run the merge only when this file is executed as a script, so importing the
# module does not parse command-line arguments or touch the filesystem.
if __name__ == "__main__":
    main()

